package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;

/**
 * News analyser for the Dayoo site (www.dayoo.com, "Dayang Net").
 *
 * <p>Recognises article URLs under {@code http://news.dayoo.com/} and extracts
 * title, date, author and body from a fetched article page.
 *
 * @author Administrator
 */
public class DayooNewsAnalyse implements AnalyseNews {

	/** Regex prefix shared by every recognised URL; dots are escaped so they only match a literal '.'. */
	private static final String SITE_ROOT = "http://news\\.dayoo\\.com/";

	/**
	 * URL shapes accepted by {@link #isDetailPage(String)}, compiled once.
	 *
	 * <p>NOTE(review): the first entry accepts the bare site root as a "detail page".
	 * That is kept for backward compatibility; confirm whether it is intended.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
		Pattern.compile(SITE_ROOT),
		// "world" articles use fixed-width numeric path segments.
		Pattern.compile(SITE_ROOT + "world/[0-9]{6}/[0-9]{2}/[0-9]{5}_[0-9]{8}\\.htm"),
		// The remaining sections share one loose numeric layout.
		Pattern.compile(SITE_ROOT
				+ "(?:finance|china|paihang|guangzhou|guangdong|society|ent)"
				+ "/[0-9]*/[0-9]*/[0-9]*_[0-9]*\\.htm")
	};

	/** Label text (Chinese for "Source:") handed to the JSoupUtils date/author matchers. */
	private static final String SOURCE_LABEL = "来源:";

	/**
	 * Tells whether {@code url} matches, in full, one of the known Dayoo URL shapes.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches a known pattern,
	 *         {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Builds a {@link NewsMeta} from the HTML carried by {@code urlMeta}.
	 *
	 * <p>The title is the text of the {@code <title>} element up to the first
	 * underscore. The body is taken from {@code div#text_content} after elements
	 * matching {@code .relate} and {@code .sidebar} have been removed from the document.
	 * Comment and click counts are never read from the page; {@code NumberUtils.parseInt}
	 * is always given {@code null} for them.
	 *
	 * <p>No null checks are performed on {@code urlMeta} or its HTML.
	 *
	 * @param urlMeta the fetched page (URL plus raw HTML)
	 * @return the populated news record
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta news = new NewsMeta();
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();

		// Neither count is extracted from the page, so both stay null.
		String commentNum = null;
		String clickNum = null;

		Document doc = Jsoup.parse(html);

		String title = StringUtils.substringBefore(doc.select("title").text(), "_");

		// The date is matched against the full document, before the removal below.
		Long date = JSoupUtils.matchDate(doc, SOURCE_LABEL);

		// Strip the ".relate" and ".sidebar" elements; content and author are read afterwards.
		doc.select(".relate,.sidebar").remove();
		String content = HtmlCleaner.getContentHtml(url, doc.select("div#text_content"));
		String author = JSoupUtils.matchAuthor(doc, SOURCE_LABEL);

		news.setUrl(url);
		// NOTE(review): the meaning of type 1 is defined by NewsMeta / its consumers — confirm.
		news.setType(1);
		news.setTitle(title);
		news.setCommentNum(NumberUtils.parseInt(commentNum));
		news.setClickNum(NumberUtils.parseInt(clickNum));
		news.setContent(content);
		news.setDate(date);
		news.setAuthor(author);
		return news;
	}

	/**
	 * Manual smoke test: checks a sample article URL, fetches it through
	 * {@code CrawlHTML.responseToURL} and prints the parsed result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		DayooNewsAnalyse analyse = new DayooNewsAnalyse();
		String url = "http://news.dayoo.com/ent/201106/14/53922_17333522.htm";
		System.out.println(analyse.isDetailPage(url));
		UrlMeta meta = CrawlHTML.responseToURL(url);
		System.out.println(analyse.parserHtml(meta));
	}

	/**
	 * Not implemented for this site.
	 *
	 * @param meta ignored
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
